The goal of this step in the workflow is to compare S2 stripchart stream height values to the stream height values recorded at manual checkpoints during the time period of interest. Manual checkpoints can be used to quality check stream height values being collected by the stripchart.
Install/load necessary R packages
Recreate intermediate data sets for use in this .Rmd file:
Individual water years’ worth of stripchart data
All water years’ worth of stripchart data
#2017
filepath <- "/Users/miaforsline/Library/CloudStorage/Box-Box/External-MEF_DATA/Hydro/Streamflow/L0_subdaily/StripCharts/AnnualBreakPoint/S2"
#read in the 2017 data
wy2017 <- read_excel(path = here(filepath, "WY2017.S2_Breakpoint.xlsx"),
skip =3) #skip the top 3 rows of the Excel Sheet
#clean the 2017 data
wy2017_clean <- wy2017 %>%
#change column names
rename(datetime = "Date/time",
stream_height_ft = "Stage.ft") %>%
#remove unnecessary columns
select(c("datetime", "stream_height_ft")) %>%
#format column names into lower_snake_case
clean_names() %>%
#create a date column
mutate(date = format(as.POSIXct(datetime, format = '%m/%d/%Y %H:%M:%S',
tz = "GMT"),
format = '%Y-%m-%d'),
date = as.POSIXct(date, tz = "GMT"))
#2018
filepath <- "/Users/miaforsline/Library/CloudStorage/Box-Box/External-MEF_DATA/Hydro/Streamflow/L0_subdaily/StripCharts/AnnualBreakPoint/S2"
#create a function to specify custom time intervals
`%between%` <- function(x, interval) x >= interval[1] & x <= interval[2]
#read in the 2018 data
wy2018 <- read_excel(path = here(filepath, "WY2018.S2_Breakpoint.xlsx"),
skip =3)
#clean the 2018 data
wy2018_clean <- wy2018 %>%
rename(datetime = "Date/time",
stream_height_ft = "Stage.ft") %>%
select(c("datetime", "stream_height_ft")) %>%
clean_names() %>%
mutate(date = format(as.POSIXct(datetime, format = '%m/%d/%Y %H:%M:%S',
tz = "GMT"),
format = '%Y-%m-%d'),
date = as.POSIXct(date, tz = "GMT"))
#assign October NA values
wy2018_clean$stream_height_ft[wy2018_clean$date %between% as.Date(c('2018-10-09', '2018-10-10'))] <- NA
wy2018_clean$stream_height_ft[wy2018_clean$date %between% as.Date(c('2018-10-11', '2018-10-15'))] <- NA
#assign November NA values
wy2018_clean$stream_height_ft[wy2018_clean$date %between% as.Date(c('2018-11-10', '2018-11-14'))] <- NA
wy2018_clean$stream_height_ft[wy2018_clean$date %between% as.Date(c('2018-11-25', '2018-12-04'))] <- NA
#assign December NA values
wy2018_clean$stream_height_ft[wy2018_clean$date %between% as.Date(c('2018-12-01', '2018-12-03'))] <- NA
wy2018_clean$stream_height_ft[wy2018_clean$date %between% as.Date(c('2018-12-08', '2018-12-10'))] <- NA
wy2018_clean$stream_height_ft[wy2018_clean$date %between% as.Date(c('2018-12-11', '2018-12-31'))] <- NA
wy2018_altered <- wy2018_clean
wy2018_altered <- subset(wy2018_altered, !(wy2018_altered$date %between% as.Date(c('2018-10-31', '2018-11-01')) & wy2018_altered$stream_height_ft < 0.10))
filepath <- "/Users/miaforsline/Library/CloudStorage/Box-Box/External-MEF_DATA/Hydro/Streamflow/L0_subdaily/StripCharts/AnnualBreakPoint/S2"
#2019
#read in the 2019 data
wy2019 <- read_excel(path = here(filepath, "WY2019.S2_Breakpoint.xlsx"),
skip =3)
#clean the 2019 data
wy2019_clean <- wy2019 %>%
rename(datetime = "Date/time",
stream_height_ft = "Stage.ft") %>%
select(c("datetime", "stream_height_ft")) %>%
clean_names() %>%
mutate(date = format(as.POSIXct(datetime, format = '%m/%d/%Y %H:%M:%S',
tz = "GMT"),
format = '%Y-%m-%d'),
date = as.POSIXct(date, tz = "GMT"))
#assign February NA values
wy2019_clean$stream_height_ft[wy2019_clean$date %between% as.Date(c('2019-04-01', '2019-04-02'))] <- NA
#assign March NA values
wy2019_clean$stream_height_ft[wy2019_clean$date == as.Date('2019-03-14')] <- NA
wy2019_clean$stream_height_ft[wy2019_clean$date == as.Date('2019-03-18')] <- NA
wy2019_clean$stream_height_ft[wy2019_clean$date %between% as.Date(c('2019-03-30', '2019-03-31'))] <- NA
#assign April NA values
wy2019_clean$stream_height_ft[wy2019_clean$date %between% as.Date(c('2019-04-01', '2019-04-02'))] <- NA
wy2019_clean$stream_height_ft[wy2019_clean$date %between% as.Date(c('2019-04-08', '2019-04-17'))] <- NA
wy2019_clean$stream_height_ft[wy2019_clean$date %between% as.Date(c('2019-04-17', '2019-04-22'))] <- NA
#assign September NA values
wy2019_clean$stream_height_ft[wy2019_clean$date %between% as.Date(c('2019-09-03', '2019-09-10'))] <- NA
#2020
filepath <- "/Users/miaforsline/Library/CloudStorage/Box-Box/External-MEF_DATA/Hydro/Streamflow/L0_subdaily/StripCharts/AnnualBreakPoint/S2"
#read in the 2020 data
wy2020 <- read_excel(path = here(filepath, "WY2020.S2_Breakpoint.xlsx"),
skip =3)
#clean the 2020 data
wy2020_clean <- wy2020 %>%
rename(datetime = "Date/time",
stream_height_ft = "Stage.ft") %>%
select(c("datetime", "stream_height_ft")) %>%
clean_names() %>%
mutate(date = format(as.POSIXct(datetime, format = '%m/%d/%Y %H:%M:%S',
tz = "GMT"),
format = '%Y-%m-%d'),
date = as.POSIXct(date, tz = "GMT"))
#create date range to fill in with NA values
start_date <- as.POSIXct("7/28/2020 13:40:00",
format = '%m/%d/%Y %H:%M:%S',
tz = "GMT")
end_date <- as.POSIXct("8/11/2020 00:00:00",
format = '%m/%d/%Y %H:%M:%S',
tz = "GMT")
#create a function to specify custom time intervals
`%between%` <- function(x, interval) x >= interval[1] & x <= interval[2]
#assign February NA values
wy2020_clean$stream_height_ft[wy2020_clean$datetime %between% c(start_date, end_date)] <- NA
#2021
filepath <- "/Users/miaforsline/Library/CloudStorage/Box-Box/External-MEF_DATA/Hydro/Streamflow/L0_subdaily/StripCharts/AnnualBreakPoint/S2"
#read in the 2021 data
wy2021 <- read_excel(path = here(filepath, "WY2021.S2_Breakpoint.xlsx"),
skip =3)
#clean the 2021 data
wy2021_clean <- wy2021 %>%
rename(datetime = "Date/time",
stream_height_ft = "Stage.ft") %>%
select(c("datetime", "stream_height_ft")) %>%
clean_names() %>%
mutate(date = format(as.POSIXct(datetime, format = '%m/%d/%Y %H:%M:%S',
tz = "GMT"),
format = '%Y-%m-%d'),
date = as.POSIXct(date, tz = "GMT"))
#identify date intervals that need to be filled w/NAs
date1 <- as.POSIXct("2021-03-04 00:00:00",
format = '%Y-%m-%d %H:%M:%S',
tz = "GMT")
date2 <- as.POSIXct("2021-03-09 00:00:00",
format = '%Y-%m-%d %H:%M:%S',
tz = "GMT")
date3 <- as.POSIXct("2021-11-11 23:59:00",
format = '%Y-%m-%d %H:%M:%S',
tz = "GMT")
date4 <- as.POSIXct("2021-12-29 00:00:00",
format = '%Y-%m-%d %H:%M:%S',
tz = "GMT")
#create a function to specify custom time intervals
`%between%` <- function(x, interval) x >= interval[1] & x <= interval[2]
#assign March NA values
wy2021_clean$stream_height_ft[wy2021_clean$datetime %between% c(date1, date2)] <- NA
#assign Nov-Dec NA values
wy2021_clean$stream_height_ft[wy2021_clean$datetime %between% c(date3, date4)] <- NA
#all water years
all_streamflow <- rbind(wy2017_clean,
wy2018_altered,
wy2019_clean,
wy2020_clean,
wy2021_clean)
min_year = 2017
max_year = 2021
External-MEF_DATA/Hydro/Streamflow/L0_subdaily/ManualChecksclean_names()collected column from a character into a
POSIXct date format#create file path to call the data from Box
## Mia's file path
filepath <- "/Users/miaforsline/Library/CloudStorage/Box-Box/External-MEF_DATA/Hydro/Streamflow/L0_subdaily/ManualChecks"
#read in the manual checks data
mc <- read_csv(here(filepath, "2017-2021_S2Stage.csv"))
#clean the data
mc_clean <- mc %>%
clean_names() %>%
#remove S2 lagging pool data and keep only the S2 weir data
subset(name == "S2 WEIR") %>%
mutate(collected = as.POSIXct(collected, format = '%m/%d/%Y %H:%M',
tz = "GMT"),
year = format(as.POSIXct(collected, format = '%Y-%m-%d %H:%M:%S',
tz = "GMT"),
format = '%Y'),
date = format(as.POSIXct(collected, format = '%Y-%m-%d %H:%M:%S',
tz = "GMT"),
format = '%Y-%m-%d'),
diff = point_gage - stripchart_stage
) %>%
subset(year >= min_year & year <= max_year)
#create subsets of the data where NA values are removed from a col of interest
#point check data
pc_clean <- mc_clean %>%
#remove rows with NA values in the point_gage col
subset(!is.na(point_gage))
#stripchart check data
sc_clean <- mc_clean %>%
#remove rows with NA values in the stripchart_stage col
subset(!is.na(stripchart_stage))
#format data for plotting
mc_plot <- mc_clean %>%
select(-logger_stage,
-year,
-date,
-site,
-name) %>%
pivot_longer(cols = c("point_gage", "stripchart_stage"),
names_to = "type",
values_to = "stream_height_ft")
#sample size
n <- nrow(mc_plot)
ggplot(data = mc_plot) +
geom_point(aes(x = collected,
y = stream_height_ft,
color = type),
alpha = 0.5) +
theme_classic() +
labs(y = "Stream Height (ft)",
x = "Time",
title = paste0("S2 Manual Checkpoints VS \n Stripchart Values Over Time (", min_year, "-", max_year, ")"),
color = "Checkpoint Type",
caption = paste0("n = ", n)
) +
theme(plot.title = element_text(hjust = 0.5)) +
scale_color_discrete(labels = c("Point Gage", "Stripchart Stage"))
point_gage - stripchart_stage#sample size
n <- nrow(mc_plot)
ggplot(data = mc_plot) +
geom_point(aes(x = collected, y = diff)) +
geom_smooth(aes(x = collected, y = diff), method = "lm") +
theme_classic() +
labs(y = "Stream Height (ft)",
x = "Point Gage - Stripchart Stage",
title = paste0("S2 Manual Checkpoints VS \n Stripchart Differences Over Time (", min_year, "-", max_year, ")"),
caption = paste0("n = ", n)
) +
theme(plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5)) +
scale_color_discrete(labels = c("Point Gage", "Stripchart Stage"))
For interactive plots,
stripchart stream height (ft) is plotted as a black line, and
manual checkpoints of stream height (ft) are plotted as red dots.
#plot
p_all <- ggplot() +
#stripchart streamflow data
geom_line(data = all_streamflow,
aes(x = datetime,
y = stream_height_ft,
group = 1,
#create hovering labels for interactive graph
text = paste0("DateTime: ", datetime, "\n",
"Stream Height (ft): ", round(stream_height_ft, digits = 3))),
size = 0.25) +
#manual checkpoints
geom_point(data = mc_clean,
aes(x = collected,
y = point_gage,
group = 1,
#create hovering labels for interactive graph
text = paste0("DateTime: ", collected, "\n",
"Stream Height Checkpoint (ft): ", round(stripchart_stage, digits = 3))),
color = "red",
size = 0.5) +
theme_classic() +
labs(x = "Time",
y = "Stream Height (ft)",
title = paste0("S2 Bog Stream Height (", min_year, "-", max_year, ")"
),
subtitle = "Stripchart data are plotted as black lines. Manual checks are plotted as red dots.") +
theme(plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5))
#static plot
#p_all
#interactive plot
ggplotly(p_all, tooltip = "text")
Next, we are interested in a 1:1 comparison of stripchart data vs
manual checkpoints data at the exact same timestamp. Since the
stripchart data and manual checkpoints do not align perfectly, we will
interpolate the stripchart stream flow values using the
na.approx() function from the zoo package to
estimate stripchart values at the time of the manual checks.
Clean the data
Note that the manual checks data only range from 2017-04-04 to 2019-12-31, which is a smaller time range than the stripchart data
#clean the stripchart_stage data (using the point_gage col)
sc_sub <- sc_clean %>%
#extract the data (without the timestamp)
mutate(date = format(as.POSIXct(collected, format = '%m/%d/%Y %H:%M:%S',
tz = "GMT"),
format = '%Y-%m-%d'),
date = as.POSIXct(date, tz = "GMT"),
year = as.numeric(year)
) %>%
#rename column
rename(datetime = collected) %>%
#remove unnecessary columns
select(-site, -lab_id, -name, -point_gage, -logger_stage, -diff)
#identify date ranges of interest:
##aka the range of the manual checkpoint data
max_date <- max(sc_sub$date)
min_date <- min(sc_sub$date)
#subset stripchart data to fit within the time range of the manual checkpoints
streamflow_sub <- all_streamflow %>%
mutate(
year = format(as.POSIXct(datetime, format = '%m/%d/%Y %H:%M:%S',
tz = "GMT"),
format = '%Y'),
year = as.numeric(year)
) %>%
subset(date <= max_date & date >= min_date)
#create dummy dataset (which is a subset of the data)
test <- full_join(x = sc_sub,
y = streamflow_sub,
by = c("datetime", "date", "year")) %>%
arrange(datetime)
x <- test[13:17, ]
#approximate linearly
y <- x %>%
mutate(approx = na.approx(stream_height_ft,
na.rm =FALSE,
method = "linear",
maxgap = 6)
)
# #plot
# ggplot(data = y, aes(x = datetime, y = approx)) +
# geom_line() +
# geom_point()
z <- full_join(x = sc_sub,
y = streamflow_sub,
by = c("datetime", "date", "year")) %>%
arrange(datetime) %>%
mutate(approx = na.approx(stream_height_ft,
na.rm =FALSE,
method = "linear",
maxgap = 6)
)
Join the manual checkpoints (sc_sub) dataset with
the continuous stripchart dataset (streamflow_sub)
Optional: Plot the data to visually examine if the approximated values generated by na.approx() look correct
#join the stripchart data and manual checks data
fj_sc <- full_join(x = sc_sub,
y = streamflow_sub,
by = c("datetime", "date", "year")) %>%
#order by datetime
arrange(datetime) %>%
#interpolate to fill in missing NA values
mutate(approx = na.approx(stream_height_ft,
na.rm = FALSE,
#x = datetime,
#xout = datetime,
method = "linear",
maxgap = 6),
#calculate the difference between approximated values and real values
diff1 = approx - stream_height_ft, #should be zero
diff2 = approx - stripchart_stage,
diff = coalesce(diff1, diff2))
#test the full join did not drop any observations
if(nrow(fj_sc) != (nrow(sc_sub) + nrow(streamflow_sub))) stop("Check fj_sc dataframe dimensions")
#test that diff1 = 0 at all times
x <- fj_sc[!is.na(fj_sc$diff1),]
if(sum(x$diff1) != 0) stop("approx - stream_height_ft should be 0")
#test that diff2 != 0
y <- fj_sc[!is.na(fj_sc$diff2),]
if(sum(y$diff2) == 0) stop("approx - stripchart_stage should not be 0")
# fj_sc_long <- fj_sc %>%
# #rearrange dataframe into a long format
# pivot_longer(cols = c("stripchart_stage", "stream_height_ft", "approx"),
# names_to = "types",
# values_to = "stream_height_ft")
##visually examine the approximated values
# ggplot(data = fj_sc) +
# geom_line(aes(x = datetime, y = approx))
#
# ggplot(data = fj_sc) +
# geom_point(aes(x = datetime,
# y = stream_height_ft,
# color = types))
#
# ggplot(data = fj_sc) +
# geom_point(aes(x = datetime,
# y = stream_height_ft,
# color = types)) +
# geom_point(aes(x = datetime,
# y = approx),
# alpha = 0.05)
# fj_sc_wide <- fj_sc %>%
# #remove extraneous columns
# select(-stream_height_ft, -diff) %>%
# #return to wide format to create scatterplot
# pivot_wider(
# names_from = "types",
# values_from = "approx"
# ) %>%
# #unlist the columns created by pivot_wider()
# unnest()
# #visually examine all data
# ggplot(data = fj_sc) +
# geom_point(aes(x = manual_check,
# y = stripchart))
Subset the joined dataframe to include only the timestamps of interest (AKA the timestamps from the original stripchart_stage dataset)
Note there will be some values that could not be approximated due to lack of stripchart data
#left join the joined data and stripchart_stage data to keep only the timestamps of interest
lj_sc <- left_join(x = sc_sub,
y = fj_sc,
by = c("datetime", "year", "stripchart_stage", "date")) %>%
#ensure the correct column types
mutate(stripchart_stage = as.numeric(stripchart_stage),
stream_height_ft = as.numeric(stream_height_ft),
approx = as.numeric(approx)
) %>%
#remove duplicates
unique()
#test if the subsetted data has the same number of observations as the manual checks dataframe
#if(nrow(lj) != (nrow(pc_sub))) stop("Check lj dataframe dimensions")
#test if the approximated data differs greatly from the stripchart/manual checks data
#if(abs(lj$diff) > 0.01 ) stop("Check differences")
#sample size
n <- nrow(lj_sc)
#plot
p_1to1 <- ggplot() +
geom_point(data = lj_sc,
aes(x = stripchart_stage, #from manual checkpoints dataset
y = approx, #continuous stripchart values
group = 1,
#create hovering labels for interactive graph
text = paste0("Stripchart Stage (ft): ", stripchart_stage, "\n",
"Interpolated Values (ft): ", round(approx, digits = 3))),
alpha = 0.4) +
theme_classic() +
labs(x = "Stripchart Stage",
y = "Interpolated Values",
title = paste0("S2 Stripchart vs Interpolated Values (", min_year, "-", max_year, ")"),
subtitle = "The line y = x is plotted for reference.") +
theme(plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5)) +
geom_abline(slope = 1, intercept = 0)
#static plot
#p_1to1
#interactive plot
ggplotly(p_1to1, tooltip = "text") %>%
layout(
title = list(text = paste0("S2 Stripchart Stage vs Interpolated Values (", min_year, "-", max_year, ")",
'<br>',
'<sup>',
#subtitle
paste0("n = ", n),
'</sup>')),
#title size
font = list(size = 14))
#clean the point gage data
pc_sub <- pc_clean %>%
#extract the data (without the timestamp)
mutate(date = format(as.POSIXct(collected, format = '%m/%d/%Y %H:%M:%S',
tz = "GMT"),
format = '%Y-%m-%d'),
date = as.POSIXct(date, tz = "GMT"),
year = as.numeric(year)
) %>%
#rename column
rename(datetime = collected) %>%
#remove unnecessary columns
select(-site,
-lab_id,
-name,
-stripchart_stage,
-logger_stage,
-diff)
#identify date ranges of interest: 2017-04-04 to 2019-12-31
##aka the range of the manual checkpoint data
max_date <- max(pc_sub$date)
min_date <- min(pc_sub$date)
nrows <- nrow(pc_sub)
# #subset stripchart data to fit within the time range of the manual checkpoints
# streamflow_sub <- all_streamflow %>%
# mutate(
# year = format(as.POSIXct(datetime, format = '%m/%d/%Y %H:%M:%S',
# tz = "GMT"),
# format = '%Y'),
# year = as.numeric(year)
# ) %>%
# subset(date <= max_date & date >= min_date)
Join the manual checks dataset with the stripchart dataset
Plot the data to visually examine if the approximated values generated by na.approx() look correct
#join the stripchart data and manual checks data
fj_pc <- full_join(x = pc_sub,
y = streamflow_sub,
by = c("datetime", "date", "year")) %>%
#rename columns
rename(manual_check = point_gage,
stripchart = stream_height_ft) %>%
#interpolate to fill in missing NA values
mutate(approx = na.approx(object = stripchart,
x = datetime,
method = "linear",
maxgap = 6))
# #rearrange dataframe into a long format
# pivot_longer(cols = c("manual_check", "stripchart"),
# names_to = "types",
# values_to = "stream_height_ft") %>%
# #visually examine the approximated values
# ggplot(data = fj_pc) +
# geom_line(aes(x = datetime, y = approx))
#
# ggplot(data = fj_pc) +
# geom_point(aes(x = datetime,
# y = stream_height_ft,
# color = types))
#
# ggplot(data = fj_pc) +
# geom_point(aes(x = datetime,
# y = stream_height_ft,
# color = types)) +
# geom_point(aes(x = datetime,
# y = approx),
# alpha = 0.05)
fj_pc <- fj_pc %>%
#remove extraneous columns
select(-stripchart)
# %>%
# #return to wide format to create scatterplot
# pivot_wider(
# names_from = "types",
# values_from = "approx"
# ) %>%
# #unlist the columns created by pivot_wider()
# unnest
# #visually examine all data
# ggplot(data = fj_pc) +
# geom_point(aes(x = manual_check,
# y = stripchart))
Subset the joined dataframe to include only the timestamps of interest (AKA the timestamps from the original manual checks dataset)
Plot
#left join the joined data and manual checks data to keep only the timestamps of interest
lj_pc <- left_join(x = pc_sub,
y = fj_pc,
by = c("datetime", "year", "date", "point_gage" = "manual_check")) %>%
#ensure the correct column types
mutate(point_gage = as.numeric(point_gage),
approx = as.numeric(approx),
diff = point_gage - approx)
#test if the subsetted data has the same number of observations as the manual checks dataframe
#if(nrow(lj_pc) != (nrow(pc_sub))) stop("Check lj dataframe dimensions")
#test if the approximated data differs greatly from the stripchart/manual checks data
#if(abs(lj_pc$diff) > 0.01 ) stop("Check differences")
#plot
p_1to1 <- ggplot() +
geom_point(data = lj_pc,
aes(x = point_gage,
y = approx,
group = 1,
#create hovering labels for interactive graph
text = paste0("Manual Checkpoint (ft): ", point_gage, "\n",
"Stripchart (ft): ", approx)),
alpha = 0.5) +
theme_classic() +
labs(x = "Manual Checkpoints",
y = "Stripchart Data",
title = paste0("S2 Manual Checks vs Interpolated Stripchart Values (", min_year, "-", max_year, ")"),
subtitle = "The line y = x is plotted for reference.") +
theme(plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5)) +
geom_abline(slope = 1, intercept = 0) +
xlim(0, 0.4) +
ylim(0, 0.4)
#static plot
#p_1to1
#interactive plot
ggplotly(p_1to1, tooltip = "text")
Run this code chunk to knit to HTML without using the knit button in RStudio
index.html to render GitHub PageNote that this file is being knit as index.html
into the manual_checks
repository in order to update the GitHub pages
website, where the plots can be viewed online.